比对是最耗资源和时间的一步
work_dir=/home4/sjshen/project/m6a/GSE29714_Cell
fastq_cut_dir=$work_dir/0.2_fastq_cut/mouse
hisat2_dir=$work_dir/1.1_hisat2/mouse
hisat2_index=/home/share/hisat2_index
echo $fastq_cut_dir
cd $fastq_cut_dir
species=mm10_tran
thread=5
#for Single-end data
for i in *_cut.fastq.gz
do
echo $i
(nohup hisat2 -p $thread --dta-cufflinks --no-discordant --no-mixed -t -x $hisat2_index/new_index/$species -U $i -S $hisat2_dir/${i%%_cut.fastq.gz}.hisat2.sam --no-unal > $hisat2_dir/${i%%_cut.fastq.gz}.hisat2.log) &
done
echo $hisat2_dir
cd $hisat2_dir
Tips1:hisat2有官方提供的比对基因组,我们也可以自己建立,需要的同学可以答疑时咨询我们老师
Tips2:hisat2默认输出一条/一对reads的最佳5处匹配位置
#for Pair-end data
for i in *_1_cut.fastq.gz
do
echo $i
t=${i/_1_cut.fastq.gz/_2_cut.fastq.gz}
echo $t
(nohup hisat2 -p $thread --dta-cufflinks --no-discordant --no-mixed -t -x $hisat2_index/new_index/$species -1 $i -2 $t -S $hisat2_dir/${i%%_1_cut.fastq.gz}.hisat2.sam --no-unal > $hisat2_dir/${i%%_1_cut.fastq.gz}.hisat2.log) &
done
work_dir=/home4/sjshen/project/m6a/GSE29714_Cell
hisat2_dir=$work_dir/1.1_hisat2
echo $hisat2_dir
cd $hisat2_dir
mkdir position_sorted_bam
#--------------------------------------------------------------------
#transfer sam to position_sorted_bam
for i in *.sam
do
echo $i
( nohup samtools sort -@ 5 -o ./position_sorted_bam/${i%%.sam}.position_sorted.bam $i ) &
done
#--------------------------------------------------------------------
#build bam index for position_sorted_bam
for i in ./position_sorted_bam/*.bam
do
( nohup samtools index $i ) &
done
work_dir=/home4/sjshen/project/m6a/GSE29714_Cell
hisat2_dir=$work_dir/1.1_hisat2
echo $hisat2_dir
cd $hisat2_dir
echo -e 'Sample\ttotal_reads(million)\taligned exactly 1 time\tpercentage\taligned >1 times\tpercentage\ttotal_percentage' > hisat2_align_summary.tab
for i in *.log
do
echo $i
awk -v sample=${i%%.hisat2.log} 'BEGIN{FS="[ \t();%]+" ; OFS="\t"} NR==5 {t=$2} NR==7 {a=$2;b=$3} NR==8 {c=$2;d=$3} END{print sample,t/1e6,a/1e6,b"%",c/1e6,d"%",(b+d)"%"}' $i >> hisat2_align_summary.tab
done
Sample | total_reads(million) | aligned exactly 1 time | percentage | aligned >1 times | percentage | total_percentage |
---|---|---|---|---|---|---|
C57BL6_Brain_Sample_1_MeRIP-SYSY | 37.755 | 25.8406 | 68.44% | 5.81707 | 15.41% | 83.85% |
C57BL6_Brain_Sample_1_Non-IP_Control | 36.7711 | 21.7639 | 59.19% | 8.96879 | 24.39% | 83.58% |
C57BL6_Brain_Sample_2_MeRIP-NEB | 35.1339 | 28.4138 | 80.87% | 2.80259 | 7.98% | 88.85% |
C57BL6_Brain_Sample_2_MeRIP-SYSY | 43.9483 | 34.9316 | 79.48% | 4.76865 | 10.85% | 90.33% |
C57BL6_Brain_Sample_2_Non-IP_Control | 41.4148 | 21.9999 | 53.12% | 12.0241 | 29.03% | 82.15% |
Tips: hisat2输出的SAM文件中 'NHN' The number of mapped locations for the read or the pair.
bam_dir=$hisat2_dir/position_sorted_bam
deeptools_bw_dir=$bam_dir/deeptools_bw
mkdir $deeptools_bw_dir
cd $deeptools_bw_dir
for i in $bam_dir/*.position_sorted.bam
do
echo $i
t=${i##*/}
name=${t%%.position_sorted.bam}
nohup bamCoverage -p 10 -b $i -o ${name}.bw --normalizeUsing RPKM --binSize 10 --smoothLength 30 > ./${name}.bw.log &
done
Tips1:deeptools是NGS比对后分析的四大金刚之一(samtools/bedtools/deeptools/bamtools),后面还会用到他
Tips2:bamCoverage是deeptools工具套件的其中一个函数,产固定步长的bigwig效果很好,他有一个姐妹函数bamCompare后面会讲
B字三剑客 “bam、bigwig、bed” ,三人足以横扫天下,后面会介绍
work_dir=/home4/sjshen/project/m6a/GSE29714_Cell
fastq_cut_dir=$work_dir/0.2_fastq_cut/mouse
bwa_dir=$work_dir/1.2_bwa/mouse
echo $fastq_cut_dir
cd $fastq_cut_dir
species=/home/share/bwa_index/mm10.fa
thread=5
#for Single-end data
for i in *_cut.fastq.gz
do
echo $i
( nohup bwa mem -t $thread $species $i -M -R '@RG\tID:sample\tSM:sample' -o $bwa_dir/${i%%_cut.fastq.gz}.bwa.sam > $bwa_dir/${i%%_cut.fastq.gz}.bwa.log 2>&1 ) &
done
echo $bwa_dir
cd $bwa_dir
Tips1:bwa需要自己建立比对基因组,需要的同学可以答疑时咨询我们老师
Tips2:bwa不论比对质量都会输出,需要后续筛选
#for Pair-end data
for i in *1_cut.fastq.gz
do
echo $i
t=${i/1_cut.fastq.gz/2_cut.fastq.gz}
( nohup bwa mem -t $thread $species $i $t -M -R '@RG\tID:sample\tSM:sample' -o $bwa_dir/${i%%_1_cut.fastq.gz}.bwa.sam > $bwa_dir/${i%%_1_cut.fastq.gz}.bwa.log 2>&1 ) &
done
## 1.sam to bam ##
for i in *.sam
do
echo $i
( nohup samtools sort -@ 5 -o ${i%%.sam}.position_sorted.bam $i ) &
done
#--------------------------------------------------------------------
## 2.filter ##
MAPQ=20
flag=2 #only for pair-end sequencing
for i in *.position_sorted.bam
do
echo $i
nohup samtools view -@ 10 -hb -f $flag -q $MAPQ $i -o ${i%%.bam}.MAPQ${MAPQ}.bam &
done
#--------------------------------------------------------------------
## 3.index ##
for i in *MAPQ*.bam
do
echo $i
mv $i ${i%%.position_sorted*}.final.bam
done
for i in *.final.bam
do
( nohup samtools index $i ) &
done
work_dir=/home4/sjshen/project/m6a/GSE29714_Cell
fastq_cut_dir=$work_dir/0.2_fastq_cut/mouse
bwa_dir=$work_dir/1.2_bwa/mouse
mkdir $bwa_dir/flagstat
#--------------------------------------------------------------------
# step1- create flagstat from bam files
for i in *.bam
do
echo $i
samtools flagstat -@ 10 $i > ./flagstat/${i/.bam/.flagstat} &
done
#--------------------------------------------------------------------
# step2 - summary from flagstat files
cd $bwa_dir/flagstat
echo -e 'Sample\tTotal mapped fragments(M)\tfilter_keep(M)\tpercentage\tduplicate(M)\tpercentage\tfinal_keep(M)\tpercentage' > flag_state.tab
for i in *.bwa.position_sorted.flagstat
do
name=${i%%.bwa.position_sorted.flagstat}
echo $name
awk -v name=$name '
BEGIN{FS="[ \t();%]+";OFS="\t"}
FILENAME==name".bwa.position_sorted.flagstat" && FNR==5 {total=t=($1/2)/1e6}
FILENAME==name".bwa.position_sorted.MAPQ20.flagstat" && FNR==5 {filter_keep=t=($1/2)/1e6;filter_keep_p=filter_keep/total*100}
END{print name,total,filter_keep,filter_keep_p"%"}' ${name}.bwa.position_sorted.flagstat ${name}.bwa.position_sorted.MAPQ20.flagstat \
>> bwa_align_summary.tab
done
Sample | Total_mapped fragments(M) | filter_keep(M) | percentage | duplicate(M) | percentage | final_keep(M) | percentage |
---|---|---|---|---|---|---|---|
C57BL6_Brain_Sample_1_MeRIP-SYSY | 32.8287 | 23.9329 | 72.9024% | 6.61624 | 20.1538% | 17.3166 | 52.7485% |
C57BL6_Brain_Sample_1_Non-IP_Control | 31.0159 | 19.873 | 64.0737% | 7.51592 | 24.2325% | 12.3571 | 39.8412% |
C57BL6_Brain_Sample_2_MeRIP-NEB | 33.1795 | 27.2499 | 82.1288% | 7.68763 | 23.1698% | 19.5623 | 58.959% |
C57BL6_Brain_Sample_2_MeRIP-SYSY | 41.8398 | 32.5188 | 77.7223% | 13.0959 | 31.3001% | 19.4229 | 46.4222% |
C57BL6_Brain_Sample_2_Non-IP_Control | 38.0361 | 18.1338 | 47.6753% | 9.7594 | 25.6583% | 8.37443 | 22.0171% |
# with duplicate
for i in ./*.position_sorted.MAPQ20.bam
do
echo $i
t=${i##*/}
name=${t%%.position_sorted*}
nohup bamCoverage -p 10 -b $i -o ${name}.withdup.bw --normalizeUsing RPKM --binSize 10 --smoothLength 30 > ./${name}.withdup.bw.log &
done
#--------------------------------------------------------------------
# without duplicate
for i in ./*.final.bam
do
echo $i
t=${i##*/}
name=${t%%.final.bam}
nohup bamCoverage -p 10 -b $i -o ${name}.rmdup.bw --normalizeUsing RPKM --binSize 10 --smoothLength 30 > ./${name}.rmdup.bw.log &
done
# step1 - mark
for i in *position_sorted.bam
do
echo $i
sambamba markdup -t 10 $i ${i/.bam/.mark.bam} > ${i/.bam/.mark.log} 2>&1 &
done
#--------------------------------------------------------------------
# step2 - remove
flag=1024
for i in *mark.bam
do
echo $i
nohup samtools view -@ 10 -Shb -F $flag $i -o ${i%%.position_sorted*}.final.bam &
done
为什么选择sambamba?因为他长得像samtools;
为什么选择sambamba?因为他比picard好用,运行稳定;
为什么选择sambamba?因为他和picard一样结果,通吃SE/PE。
。。。还问?不用问了,用就对了!
不过picard的统计还是很丰富的,有时还是有点作用的那到底去不去?两个门派都有,哪个好用就入哪个门派吧